Author

Danyili Hong

Code
import pandas as pd
import altair as alt
alt.data_transformers.enable("vegafusion")

# Read the wvs.csv extract from its remote location into a DataFrame and
# preview the first five rows to check the column layout.
url = "https://calvin-data304.netlify.app/data/wvs.csv"
df = pd.read_csv(filepath_or_buffer=url)
print(df.head(n=5))
   sex  birth_year  birth_country_iso  age  age6  age3  married  \
0    2      1975.0               9999   43     3     2        1   
1    1      1957.0                 36   60     5     3        1   
2    1      1977.0               9999   41     3     2        1   
3    2      1974.0               9999   43     3     2        1   
4    2      1970.0               9999   48     4     2        1   

   married_before country  COW_NUM COW_ALPHA  democracy_importance  \
0              -4     AUS      900       AUL                     9   
1              -4     AUS      900       AUL                    10   
2              -4     AUS      900       AUL                     6   
3              -4     AUS      900       AUL                     9   
4              -4     AUS      900       AUL                    10   

   wave_chronology  ISO_country  S004  respondent_number_orig  \
0                7           36    -4                36071236   
1                7           36    -4                36070000   
2                7           36    -4                36070001   
3                7           36    -4                36070002   
4                7           36    -4                36070003   

   respondent_number_unified    weight  weight_equilibrated  survey_year  
0                   36720001  1.010623             0.551572         2018  
1                   36720002  0.651305             0.551572         2018  
2                   36720003  1.116451             0.551572         2018  
3                   36720004  0.591649             0.551572         2018  
4                   36720005  1.589662             0.551572         2018  
Code
# Tally how many rows of df fall under each country code and give the two
# resulting columns readable names for the chart axes.
country_counts = df['country'].value_counts().reset_index()
country_counts.columns = ['country', 'Number of Respondents']

sorted_countries = country_counts.sort_values(by='Number of Respondents', ascending=False)

# Bar chart of respondents per country, bars ordered from largest to smallest.
chart = alt.Chart(sorted_countries).mark_bar().encode(
    x=alt.X('country:N', sort=alt.EncodingSortField(field='Number of Respondents', op='sum', order='descending')),
    y='Number of Respondents:Q',
    color=alt.value('steelblue')
).properties(
    width=800,
    title='Number of Respondents in Each Country'
)

# The configure_* methods return a NEW chart rather than modifying in place,
# so the result must be assigned back or the styling is silently discarded.
# Title settings are merged into one call: a second configure_title() would
# replace the title config set by the first instead of adding to it.
chart = chart.configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_title(
    fontSize=16,
    anchor='start'
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
).configure_header(
    titleFontSize=16,
    labelFontSize=14
)

chart

The number of respondents differs notably from country to country.

Code
# Show the distinct age3 codes so the axis categories are known up front.
print("Unique values in age3:", df['age3'].unique())

# Stacked bar chart: number of rows in each age3 group, coloured by country.
# age3 holds a handful of integer group codes, so it is encoded as ordinal
# (':O'); left untyped, Altair infers a continuous quantitative axis for
# integer columns, which misrepresents the discrete groups.
chart1 = alt.Chart(df).mark_bar().encode(
    x='age3:O',
    y='count()',
    color='country:N'
).properties(
    title="Age Groups (3 categories) by Country"
)



chart1
Unique values in age3: [2 3 1]
Code
# Show the distinct age6 codes so the axis categories are known up front.
print("Unique values in age6:", df['age6'].unique())

# Stacked bar chart: number of rows in each age6 group, coloured by country.
# age6 holds integer group codes, so it is encoded as ordinal (':O') rather
# than left to be inferred as a continuous quantitative axis.
chart2 = alt.Chart(df).mark_bar().encode(
    x='age6:O',
    y='count()',
    color='country:N'
).properties(
    title="Age Groups (6 categories) by Country"
)

chart2
Unique values in age6: [3 5 4 2 6 1]
Code
# Show the distinct age6 codes before plotting.
print("Unique values in age6:", df['age6'].unique())

# One line-with-points panel per country: row count in each age6 group.
chart2 = (
    alt.Chart(df)
    .mark_line(point=True)
    .encode(
        alt.X('age6:N'),
        alt.Y('count()'),
        alt.Color('country:N'),
    )
    .properties(title="Age Groups (6 categories) by Country")
    .facet(facet='country:N')
)

chart2
Unique values in age6: [3 5 4 2 6 1]
Code
# Show the distinct age6 codes before plotting.
print("Unique values in age6:", df['age6'].unique())

# One line panel per country: mean democracy_importance within each age6 group.
# The title names the plotted quantity (average democracy importance); the
# previous wording suggested the age groups themselves were being averaged.
# NOTE(review): other columns in the preview use negative codes (e.g. -4),
# possibly as missing-value markers; confirm democracy_importance has none
# before trusting these averages.
chart2 = alt.Chart(df).mark_line().encode(
    x='age6:N',
    y=alt.Y('average(democracy_importance):Q', title='Average democracy importance'),
    color='country:N'
).properties(
    title="Average Democracy Importance by Age Group (6 categories) and Country"
).facet(
    facet='country:N'
)

chart2
Unique values in age6: [3 5 4 2 6 1]
Code
# Show every distinct age value before plotting.
print("Unique values in age:", df['age'].unique())

# One line panel per country: mean democracy_importance at each individual age.
# The title names the plotted quantity (average democracy importance); the
# previous wording suggested ages were being averaged.
# NOTE(review): age is encoded as nominal here, giving one axis category per
# distinct age; a quantitative encoding ('age:Q') may read better. Confirm
# the intent before changing it.
chart2 = alt.Chart(df).mark_line().encode(
    x='age:N',
    y=alt.Y('average(democracy_importance):Q', title='Average democracy importance'),
    color='country:N'
).properties(
    title="Average Democracy Importance by Age and Country"
).facet(
    facet='country:N'
)

chart2
Unique values in age: [43 60 41 48 57 25 62 78 44 83 70 64 47 54 61 36 77 50 20 33 45 32 34 52
 59 35 39 26 29 53 28 55 58 74 69 67 72 49 31 93 40 18 85 68 71 66 79 63
 56 80 30 21 42 51 73 22 76 82 75 37 27 81 46 84 86 38 65 23 19 87 89 94
 24 90 88 91 92 17 98 96 95]

This is worse: with a separate category for every individual age, the charts are hard to read at a glance.

Code
# Linear Regression
# Scatter of weight against age, overlaid with a straight-line fit.
linear_reg = alt.Chart(df).mark_point().encode(
    x='age',
    y='weight'
).properties(
    title='Scatter Plot with Linear Regression'
)

# transform_regression defaults to method='linear'.
linear_reg += linear_reg.transform_regression('age', 'weight').mark_line()

# Polynomial Regression
# Same scatter, overlaid with a cubic (order-3) polynomial fit.
poly_reg = alt.Chart(df).mark_point().encode(
    x='age',
    y='weight'
).properties(
    title='Scatter Plot with Polynomial Regression'
)

# method='poly' must be given explicitly: `order` only applies to the
# polynomial method, and without it the default linear fit is drawn, making
# this panel identical to the linear one.
poly_reg += poly_reg.transform_regression('age', 'weight', method='poly', order=3).mark_line()

# Show the two fits side by side.
linear_reg | poly_reg